Libraries

library(tidyverse)
library(sf)
library(raster)

Load Data

Background image

# Load the merged Satellite image for Kazanlak Valley, Bulgaria
# use cds-spatial repo from github https://github.com/CDS-AU-DK/cds-spatial
kaz <-brick("../1_Teaching/cds-spatial/data/Kaz.tif")
plotRGB(kaz, stretch= "lin")

crs(kaz)
## CRS arguments:
##  +proj=utm +zone=35 +datum=WGS84 +units=m +no_defs

Prediction data

# Load prediction data as points (left bottom corner of the evaluated cell)
cnne_df <- read_csv("2021-10-25.predictions/results/east/east.csv")
# 15334 points (origins in the raster cells)
cnnw_df <- read_csv("2021-10-25.predictions/results/west/west.csv")
# 15334 points (origins in the raster cells)
cnn_df <- rbind(cnne_df, cnnw_df)
# 30504 rows

# Check the probabilities of there being a mound in a given cell
hist(cnn_df$mound_probability,
        main = "Probability of mound present in a satellite image gridcell");abline(v=0.6, col="red", lty = 2, lwd = 3)

table(cut(cnn_df$mound_probability, c(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)))
## 
##   (0,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6] (0.6,0.7] (0.7,0.8] 
##      7865      7710      6983      4757      2243       653       195        58 
## (0.8,0.9]   (0.9,1] 
##        30        10
# ok, so there is an exponential drop-off. 

Make the predictions to sf grid

Let’s make the predictions to a grid so we can assess whether gridcells with higher probabilities of mound presence in fact contain burial mounds. To keep the grid lightweight, let’s start with those gridcells that have values of mound probability at 60% and higher.

### Build a grid of those with 60%+ probability of containing a mound

# Build a grid from all points
#cnnall_sp <- st_as_sf(cnn_df, coords = c("x","y"), crs = 32635)
#cnnall_grid <- st_make_grid(cnnall_sp, cellsize = 250, what = "polygons")

# Filter predictions to those that have 60+% likelihood of containing a mound
cnn60_sp <- cnn_df %>% 
  filter(mound_probability > 0.59) %>%  #333 observations
  st_as_sf(coords = c("x","y"), crs = 32635)

# Make a grid of 60%+ cells, 382 cells, 332 unique ones
cnn_grid <- st_make_grid(cnn60_sp, cellsize = 250, what = "polygons")
#plot(cnn_grid)

# Add probability data to the 60%+ grid 
#cnnall_datagrid <- st_join(st_sf(cnnall_grid), cnnall_sp)
cnn_grid <- st_join(st_sf(cnn_grid), cnn60_sp)

# Visualize the grid cells with higher probability 
ggplot(cnn_grid) +
  geom_sf(aes(color = mound_probability))

# 333 raster cells are predicted to contain mounds with greater 
# than 60% likelihood. Archaeologists found 773 mounds in fraction of the area

# View the grid 
plotRGB(kaz, stretch= "lin");
plot(cnn_grid$geometry, add = TRUE, border = "white")

Field data

that we will need later for validation

# Bring in all the mounds observed in the field
mounds <- st_read("../1_Teaching/cds-spatial/data/KAZ_mounds.shp")
## Reading layer `KAZ_mounds' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_mounds.shp' using driver `ESRI Shapefile'
## Simple feature collection with 773 features and 5 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS:  WGS 84 / UTM zone 35N
plot(mounds$geometry, main = "Mounds found through survey")

# Bring in survey area to see overall coverage
survey <- st_read("../1_Teaching/cds-spatial/data/KAZ_surveyarea.shp")
## Reading layer `KAZ_surveyarea' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_surveyarea.shp' using driver `ESRI Shapefile'
## Simple feature collection with 98 features and 5 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 352181.7 ymin: 4712923 xmax: 368890.2 ymax: 4733359
## projected CRS:  WGS 84 / UTM zone 35N
plot(survey$geometry, main = "Area covered by survey")

# Extrapolate rough study area after subtracting outliers
far <- mounds %>% 
  st_is_within_distance(survey, 1500) %>% 
  lengths>0 
farmounds <- which(far==0)  # these are the mounds that are mostly far from survey area

# convex hull of points without outliers
survey_sm <- st_convex_hull(st_union(mounds$geometry[-farmounds]))

# convex hull of survey polygons 
survey_ch <- st_convex_hull(st_union(survey$geometry))

# quickly see the difference in bounding boxes
plot(survey_ch, col = "green", main = "Convex hull around survey area (tight and extensive)");plot(survey_sm, col = "red", alpha=0.5, add = TRUE); plot(survey$geometry, add =TRUE)

Plot the field data

# View all mounds
plotRGB(kaz, stretch= "lin");
plot(survey_ch, col = "green", add = TRUE);
plot(survey_sm, col = "red", add = TRUE);
plot(survey$geometry, col = "lightyellow", add = TRUE );
plot(mounds$geometry[-farmounds,], add = TRUE, col = "pink");
plot(cnn_grid$geometry, add = TRUE, border = "white");
legend("bottom", legend = "Grid cells with 60%+ probability of mounds (white squares) overlayed on \nthe satellite image and survey study area outlines with mounds (pink circles)")

Validation

The Tundzha Regional Archaeological project surveyed 85 sq km of the area included in the satellite imagery in 2009-2011, documenting the burial mounds within which create a dataset suitable for validation.

Area surveyed contains 773 documented mounds of various shapes and sizes. This area also contains 192 of 332 grid cells of 250m a side which were allocated 0.6+ probability of containing a mound. Let’s explore how many mounds are outside these grid cells and which mounds were predicted and which not. Bring in mound attributes and check correlation. Does size play a role?

Bring in mound attributes

# Bring in mound attributes (n = 773) to aid exploration
mounddata <- read_csv("../1_Teaching/cds-spatial/data/KAZ_mdata.csv")
mounds <- mounds %>% 
  left_join(mounddata, by = c("TRAP_Code"="MoundID"))

Crop 60%+ cell grid to TRAP study area

Let’s see which of the grid cells with high probability of containing a mound are in the TRAP study area?

# Select gridcells that fall within TRAP study area
survey_grids60 <- st_intersection(survey_ch, cnn_grid) # 192 grid cells with 60%+
#plot(survey_grids60)

True Positives

Archaeological survey documented 773 mounds in the study area. Mounds from this dataset that fall in the 60%+ probability gridcells form the true positives and comprise the success of the CNN model.

# Check spatial overlap between 60%+ grid cells and all 773 mounds 
predicted <- st_intersection(mounds, cnn_grid)# %>% distinct()

# Which ones are not-predicted? 
`%nin%` = Negate(`%in%`)
unpredicted <- mounds[mounds$TRAP_Code%nin%predicted$TRAP_Code,] # 627 not predicted

head(predicted,2)  #146 unique features among 166 features caught via intersecton
## Simple feature collection with 2 features and 11 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 360896.8 ymin: 4713055 xmax: 361446.8 ymax: 4713591
## projected CRS:  WGS 84 / UTM zone 35N
##    Notes TRAP_Code Cluster      Lat     Long Condition Robbed Height LandUse
## 82 Mound      4066       3 42.55737 25.30552         3      1      5 Pasture
## 83 Mound      4067       3 42.56229 25.31209         2      1      2 Pasture
##      X1 mound_probability                 geometry
## 82 4748         0.8855298 POINT (360896.8 4713055)
## 83 5499         0.7194230 POINT (361446.8 4713591)
length(unique(predicted$TRAP_Code)) # 146 unique mounds
## [1] 146
length(unique(cnn_grid$X1)) #332 unique cells
## [1] 332
grids_n_mounds <- predicted %>% 
#  st_drop_geometry() %>% 
  group_by(X1) %>% 
  count()

length(unique(grids_n_mounds$X1)) # all the detected mounds are withing 51 grids, out of which one is labelled NA, so 50 it is
## [1] 50
hist(grids_n_mounds$n, breaks = 40, main = "Frequency of predicted mounds (146) per grid cell(n = 50")

# Summary: 50 out of 332 Grid cells with 60%+ probability contain between 1 and 29 detected mounds, which makes sense for the necropolis of little 10m diameter mounds in the NW

The previous chunk shows that 146 of 773 unique mounds appear in 51 of the 192 “survey” grids that have 60% or higher probability of mounds and are at least partially within the TRAP study area. The intersection actually shows 166 points, but some fall on the border of two gridcells and are double-counted. The entire satellite image contains 332 grids with 60%+ probability of mounds, 192 fall in the TRAP study area and 50 of these contain 146 mounds.

Add un/predicted column to mound data

# Add the information on prediction to mound dataset
mounds <- mounds %>% 
  mutate(predicted60 =  case_when(TRAP_Code %in% predicted$TRAP_Code ~ "yes",
                                  TRAP_Code %nin% predicted$TRAP_Code ~ "no"
                                  ))

# Look at the predicted and un=predicted mounds 
unpredicted <- mounds[mounds$TRAP_Code%nin%predicted$TRAP_Code,]

plot(unpredicted$geometry, col = "red", main = "Mounds predicted (green) and unpredicted (red) by CNN");plot(predicted$geometry, col = "darkgreen", add= TRUE); 

Explore impact of height on detectability

Let’s look at whether mound Height is a factor impacting detectability. (it is a factor for visual inspection as higher mounds are more visible in sat img.)

# filter the mounds for largesh and noticeable ones (2+ meters)
large <- mounds %>% 
  filter(Height>=2) # 247 obs

mounds %>% 
  group_by(predicted60) %>% 
  summarize(min = min(Height), max = max(Height))
## Simple feature collection with 2 features and 3 fields
## geometry type:  MULTIPOINT
## dimension:      XY
## bbox:           xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS:  WGS 84 / UTM zone 35N
## # A tibble: 2 x 4
##   predicted60   min   max                                               geometry
##   <chr>       <dbl> <dbl>                                       <MULTIPOINT [m]>
## 1 no            0      20 ((352505.2 4725060), (352519.3 4725007), (352536.4 47~
## 2 yes           0.1    10 ((352481.3 4724776), (352488.8 4724747), (352489.3 47~
# Check the height difference between un- and predicted mounds
boxplot(Height~predicted60, data = mounds, xlab = "Was mound detected?")

# Check height distribution btw un- and predicted mounds
hist(mounds$Height[mounds$predicted60 == "no"], col = "pink", breaks  = 20,main = "Histogram of detected (yellow) and undetected (pink) mounds by height",
     xlab = "Height (m)");
hist(mounds$Height[mounds$predicted60 == "yes"], col = "yellow", add =TRUE, alpha = 0.5);

T-test on height

# Is there a significant relationship between the Height of a mound and its detection?
t.test(Height ~ as.factor(predicted60), data = mounds)
## 
##  Welch Two Sample t-test
## 
## data:  Height by as.factor(predicted60)
## t = 4.2302, df = 289.8, p-value = 3.136e-05
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
##  0.3825377 1.0482416
## sample estimates:
##  mean in group no mean in group yes 
##          1.877033          1.161644
str(t.test(Height ~ as.factor(predicted60), data = mounds))
## List of 10
##  $ statistic  : Named num 4.23
##   ..- attr(*, "names")= chr "t"
##  $ parameter  : Named num 290
##   ..- attr(*, "names")= chr "df"
##  $ p.value    : num 3.14e-05
##  $ conf.int   : num [1:2] 0.383 1.048
##   ..- attr(*, "conf.level")= num 0.95
##  $ estimate   : Named num [1:2] 1.88 1.16
##   ..- attr(*, "names")= chr [1:2] "mean in group no" "mean in group yes"
##  $ null.value : Named num 0
##   ..- attr(*, "names")= chr "difference in means between group no and group yes"
##  $ stderr     : num 0.169
##  $ alternative: chr "two.sided"
##  $ method     : chr "Welch Two Sample t-test"
##  $ data.name  : chr "Height by as.factor(predicted60)"
##  - attr(*, "class")= chr "htest"
# ... the answer seems to be yes with p at 3.14e-05

False Negatives

Given 146 mounds were predicted by CNN, where are the remaining 627? Let’s explore where the 627 missing mounds are and what probability has been assigned to the cells that contain them by CNN model.

# Create a grid of ALL the predictions to see where the remaining 627 unpredicted mounds are 
cnnall_sp <- st_as_sf(cnn_df, coords = c("x","y"), crs = 32635)
#plot(cnnall_sp)
cnnall_grid <- st_make_grid(cnnall_sp, cellsize = 250, what = "polygons") 
#plot(cnnall_grid) # we can see that the cells overlap in the middle

cnnall_grid <- st_join(st_sf(cnnall_grid), cnnall_sp) # add attributes

# Trying to eliminate overlap in gridcells
# eliminating id duplicates is nonsensical as the two datasets have same ids, I  must aggregate spatially, e.g. st_union or st_difference, or relabel the datasets before merging, or crop to a shared non-overlapping border.
missing <- st_intersection(unpredicted, cnnall_grid) # 2158, clearly duplicates

# Check for duplicates
?distinct()
length(duplicated(missing$TRAP_Code)) # 2158 points to duplication
## [1] 2158
missing <- missing[!duplicated(missing$TRAP_Code),] #620 undetected mounds

So we have 620 missing mounds. How many grids are these located within?

# How many grids are the missing mounds in?
missing %>% 
  st_drop_geometry() %>% 
  group_by(X1) %>% 
  tally() %>% arrange(desc(n))
## # A tibble: 259 x 2
##       X1     n
##    <dbl> <int>
##  1  7551    28
##  2  9209    15
##  3  7366    13
##  4  9766    12
##  5  7367    11
##  6  7552    11
##  7  7934    10
##  8  8291    10
##  9  8474    10
## 10  8477    10
## # ... with 249 more rows
# The missing 620 mounds are contained within 259 grid cells of the satellite image 

# Let's see the distribution of missing mounds per grid cell
missing %>% 
  group_by(X1) %>% 
  count() %>% 
  ggplot()+ geom_histogram(aes(n))+ggtitle("Count of undetected mounds (620) per gridcell(n=259)") + theme_bw()

# some gridcells contain up to 28 undetected mounds. No surprise with the NW necropolis

Undetected mounds appear in 259 gridcells, which contain between 1 and 29 mounds, heavily skewed to the left.

What is the probability of the cells that contain undetected mounds?

# What is the mound_probability in these gridcells?
hist(missing$mound_probability, main = "Probability of gridcells containing undetected mounds")

Somehow 5 of the undetected mounds still have high mound-probability!? How have these snuck through? Maybe they are outside the study area?

Discrepancy in gridcell/mound probability

Probability in five undetected mounds is showing to be over 0.59 when it should be below! Let’s visually inspect what is going on and investigate:

# Ggplot of mounds with high prob and grid with high prob
# missing %>% 
#   filter(mound_probability>0.59) %>% 
#   ggplot()+
#   geom_sf(aes(color = mound_probability))+
#   geom_sf(data = cnn_grid, alpha = 0.5)

library(mapview)
missing %>% 
  filter(mound_probability>0.59) %>% 
  mapview(color = "red")+
 # mapview(cnn_grid, alpha = 0.5)+
  mapview(cnnall_grid, zcol= "mound_probability")

Visual inspection of the 5 undetected mounds with high probability shows they fall OUTSIDE the 60%+ gridcells. The problem here is as follows: These mounds have the high probability derived from gridcells, because the grid is composed of two grids, which partially overlap in the middle. Each set of overlapping cells have different values of mound likelihood! Mounds that sit in these gridcells have been duplicated and have both high and low probability values.

Ideally, I should be probably running the validation separately for Kaz-East and Kaz-West images as that is how the CNN values have been generated. However, it is interesting that the model generates different values for the same locations.

Most of the visualisation below is just a rough guide as the overlap in grids needs to be sorted.

Visualize results

# See the predicted mounds over grids with 60% likelihood of mound plus unpredicted mounds

library(mapview)
mapview(unpredicted, color = "orange", alpha= 0.5,
         map.types = c("Esri.WorldImagery", "CartoDB.Positron"),
         layer.name = "Undetected mounds")+
  mapview(grids_n_mounds, zcol = "n", at = c(1, 2, 5, 10, 15, 30),
          layer.name = "Detected mound no per gridcell")+
  mapview(survey_grids60, layer.name = "Gridcells with 60%+ probability")

Summary of success rates

146 mounds was detected out of 773, making the CNN predict (over 60% probability) ca 20% of mounds documented in the study area. Out of the 192 gridcells which are within the archaeological study area and have 60%+ probabilty of containing a mound, 50 actually contained between 1 and 29 mounds. Smaller mounds seem to enjoy higher detection rate according to the t-test, but that may not be the right metric. Let’s say that surprisingly, the large mounds are not detected as frequently as I would expect, and as human would detect them.

The remaining undetected 620 mounds were found in 259 gridcells, clustered from 1 to 28 specimen. Their exploration shows a problem with underlying grid overlap which cause reduplication of probability values (which are not identical) and proliferation of mound duplicates which each have different probability value! See section discrepancy above.

Once overlap is resolved, I can slide the probability downwards and downwards and see if I get higher veracity (more cells will be TPs) with higher probability or decrease missing values (fewer FNs) with lower probability cells.

What is the model succeeding at and what not?

Look at the NW cluster and see that many mounds sit on the border of polygons, and so get counted twice > ca 20 mounds, so the numbers of predicted are slightly exaggerated In NE we are missing a lot of very large 20m diameter+ mounds that are covered with scrub (look forested) In the south a lot of forest patches and beach boundaries are getting picked up that are false positives. Here the overlap may be at play !! REVISIT. All in all the model seems to pick on the bright signatures rather than round shapes and there is an odd combination of large forested lozenges and small bare spots picked up, but somehow round forested images with little excavated (bright) trenches on the south side is not getting picked up.

Additional visualisations

# Where are the mounds vis-a-vis the probability values
cnnall_grid %>% 
  ggplot()+
  geom_sf(aes(color = mound_probability))+
  scale_color_gradient(low = "green", 
                       high = "red",
                       "Mound probability")+
  geom_sf(data = mounds) +
  ggtitle("Location of known mounds vis-a-vis the CNN predictions")

# See the 146/166 mound features predicted with 60%+ probability 
cnn_grid %>% 
 # filter(mound_probability>0.5) %>% 
  ggplot()+
  geom_sf(aes(color = mound_probability))+
  scale_color_gradient(low = "green", 
                       high = "red",
                       "Mound probability")+
  geom_sf(data = mounds$geometry[mounds$predicted60 == "yes"], size = mounds$Height[mounds$predicted60 == "yes"], col = "lightgrey", alpha = 0.5) +
  ggtitle("Predicted mounds in Kazanlak")

# See the 627 mounds not caught by the cells with 60%+ probability 
cnn_grid %>% 
  # filter(mound_probability>0.5) %>% 
  ggplot()+
  geom_sf(aes(color = mound_probability))+
  scale_color_gradient(low = "green", 
                       high = "red",
                       "Mound probability")+
  #geom_sf(data = unpredicted$geometry)+
  geom_sf(data = mounds$geometry[mounds$predicted60 == "no"], size = mounds$Height[mounds$predicted60 == "no"], col = "lightgrey", alpha = 0.5)+
  ggtitle("Unpredicted mounds in Kazanlak")

Side-by-side views

These images take mound height into account and split the results into facets

# Cumulative facetted visual

ggplot()+
  geom_sf(data = cnn_grid, aes(fill = mound_probability)) +
  labs(fill = "Probability of mound")+
  # add mounds
  geom_sf(data = mounds, 
          aes(size = Height, col = "red",  alpha = 0.6)) +
  facet_wrap(~predicted60)+
  theme_bw() + 
  guides(color = FALSE,
         size = guide_legend(order = 1),
         fill = guide_legend(order = 2),
         alpha = FALSE)+
  ggtitle("Mounds located in gridcells with 60%+ probability (no=620, yes=146)")

par(mfrow = c(1,2))

# Look at the predicted and un=predicted mounds 
plotRGB(kaz, stretch= "lin");
legend("top", legend = "Detection differences by mound height")
plot(mounds$geometry[mounds$predicted60 == "no"], add = TRUE, col = "red")
plot(mounds$geometry[mounds$predicted60 == "yes"], add = TRUE, col = "green")
plot(cnn_grid$geometry, add = TRUE, border = "white")
legend(x = "bottom",          # Position
       legend = c("undetected", "detected",
                  "grids with 60%+ mound probability"),  # Legend texts
       pch = c(1,1,0),       # Symbol shapes
       col = c("red","green", "lightgrey"))           # Line colors
     

# View Large mounds only
plotRGB(kaz, stretch= "lin");
plot(mounds$geometry[mounds$predicted60 == "no" & mounds$Height >=2], add = TRUE, col = "red")
plot(mounds$geometry[mounds$predicted60 == "yes"& mounds$Height >=2], add = TRUE, col = "green")
plot(cnn_grid$geometry, add = TRUE, border = "white")
legend(x = "bottom",          # Position
       legend = c("undetected 2m+ mounds", "detected 2m+ mounds",
                  "grids with 60%+ mound probability"),  # Legend texts
       pch = c(1,1,0),       # Symbol shapes
       col = c("red","green", "lightgrey"))           # Line colors

## Other factors?? To be continued, … How many of the 60% prob grids are empty? Look at 70% and higher probability and see if a higher number of gridcells contain mounds.

Differentiate predictions by other criteria - perhaps RS dataset visibility in 2001 IKONOS?

Create a segregated density image of mound presence?

Biggest ommisions (false negatives) appear in NE where large mounds are present and biggest overcommits (false positives) are south of the reservoir, where there are morphological features but few mounds.